complete herbarium data

Author

Yi Liu

Published

June 2, 2024

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)

1 get herbarium data

raw <- read.csv("/Volumes/seas-zhukai/phenology/phenology_leaf_flower_lag/Herbarim_flower/combined_occ_img_downloaded.csv") %>% 
  dplyr::select(day, month, year, startDayOfYear, coordinateUncertaintyInMeters, decimalLongitude, decimalLatitude, filename_image, species, genus,family)

phenology <- read.csv("/Volumes/seas-zhukai/phenology/phenology_leaf_flower_lag/Herbarim_flower/phenology.csv") %>% 
  mutate(filename_image = gsub(".txt", "", file_name))

joint_data <- left_join(phenology,raw, by = "filename_image")

1.1 fillter flowering time

joint_data_flower <- joint_data %>% #116805
  filter(flower_one > 0 & flower_many > 0) %>% 
  filter(!is.na(startDayOfYear)) %>% 
  filter(year>=1895) %>% 
  dplyr::select(decimalLongitude, decimalLatitude, startDayOfYear, year, species, genus, family) %>%
  # delete family Pinaceae and Cupressaceae
  filter(family!="Pinaceae" & family!="Cupressaceae") %>%
  rename(lon = decimalLongitude, lat = decimalLatitude, doy = startDayOfYear) %>% 
  distinct()  # clear herbarium data for repeat file and repeat file with different phenology

# two repeated component:
# 1. completely the same ~300
# 2. same specimen (different name) with different phenology (as long as the flower is consistent, we will keep them) ~2000

2 extract the climate normality

library(raster)

# extract the climate normality
complete_period_raster <- raster("../data/prism/complete_period_springmean.tif")

joint_data_flower_normality <- joint_data_flower %>%
  dplyr::select(lat, lon) %>%
  distinct() %>%
  mutate(complete_period_temp = extract(complete_period_raster, cbind(lon, lat)))

# extract the climate anormality
# Initialize an empty data frame to store the results
joint_data_flower_anormality <- data.frame()

# Loop through the specified years
for (fo_year in 1895:2023) {
  # Load the yearly raster file
  yearly_raster <- raster(paste0("../data/prism/", fo_year, "_springmean.tif"))
  
  # Process the joint_data_flower for the current year
  yearly_data <- joint_data_flower %>%
    dplyr::select(year, lat, lon) %>%
    distinct() %>%
    filter(year == fo_year) %>%
    mutate(yearly_temp = extract(yearly_raster, cbind(lon, lat)))
  
  # Append the yearly data to the cumulative data frame
  joint_data_flower_anormality <- rbind(joint_data_flower_anormality, yearly_data)
}

# Combine the normality and anormality data
temperature_data <- joint_data_flower_normality %>%
  right_join(joint_data_flower_anormality, by = c("lat", "lon")) %>%
  rename(norm = complete_period_temp, yeart = yearly_temp) %>%
  mutate(anom = yeart - norm) %>%
  right_join(joint_data_flower, by = c("lat", "lon", "year")) %>%
  filter(!is.na(anom)) 

write.csv(temperature_data, "../data/herb_temperature_data.csv")

3 fit the model

3.1 remove outlier

library(mvoutlier)
Loading required package: sgeostat
temperature_data <- read.csv("../data/herb_temperature_data.csv")

temperature_data_clean <- temperature_data %>%
  group_by(species) %>%
  filter(n() > 30) %>%
  group_modify(~ {
    data_matrix <- dplyr::select(.x, yeart, doy) %>% as.matrix()
    outlier_result <- aq.plot(data_matrix)
    .x %>% mutate(outliers = outlier_result$outliers)
  }) %>%
  ungroup()

3.2 keep record of number of observation deleted and left

temperature_data_clean %>% 
  group_by(species) %>%
  summarise(total = n(),
            num_outliers = sum(outliers),
            delete_p = num_outliers/total) %>%
  arrange(desc(delete_p))
# A tibble: 74 × 4
   species               total num_outliers delete_p
   <chr>                 <int>        <int>    <dbl>
 1 Quercus sadleriana       61           20    0.328
 2 Betula lenta            188           58    0.309
 3 Acer tataricum           59           17    0.288
 4 Betula papyrifera       190           54    0.284
 5 Fraxinus americana      181           50    0.276
 6 Betula alleghaniensis   145           35    0.241
 7 Betula populifolia      108           24    0.222
 8 Quercus durata           61           13    0.213
 9 Quercus john-tuckeri     38            8    0.211
10 Quercus stellata         87           18    0.207
# ℹ 64 more rows

3.3 get data for model by requiring 10 observations for each parameter

temperature_data_model <- temperature_data_clean %>%
  filter(!outliers) %>%  # Correctly referencing the outliers column
  group_by(species) %>%
  filter(n_distinct(doy, anom, norm) > 30) %>%  # Use n_distinct() for distinct counting
  ungroup()

3.4 fit and plot the model

source("../scripts/function_visionalize_summmary_MLmodel.R")

# Apply the function to each species and store the results
results <- list()
unique_species <- unique(temperature_data_model$species)

for (species_name in unique_species) {
  results[[species_name]] <- analyze_species(temperature_data_model, species_name)
}

# Combine all summary rows into a single data frame
summary_results <- bind_rows(lapply(results, function(res) res$summary))

# Save all plots to a single PDF file
pdf("../data/species_plots_herb.pdf", width = 8, height = 6)
for (species_name in unique_species) {
  print(results[[species_name]]$plot)
}
dev.off()

write.csv(summary_results, "../species_summary_herb.csv", row.names = FALSE)

4 get spices composition

library(data.tree)
# Example data
species_data <- temperature_data_model %>%
  group_by(species, genus, family) %>%
  summarise(specimen_count = n())
`summarise()` has grouped output by 'species', 'genus'. You can override using
the `.groups` argument.
# Aggregate data at genus level
genus_data <- species_data %>%
  group_by(family, genus) %>%
  summarise(specimen_count = sum(specimen_count))
`summarise()` has grouped output by 'family'. You can override using the
`.groups` argument.
# Aggregate data at family level
family_data <- species_data %>%
  group_by(family) %>%
  summarise(specimen_count = sum(specimen_count))

# Total specimen count
total_specimen_count <- sum(species_data$specimen_count)

# Create hierarchical data
hierarchical_data <- species_data %>%
  mutate(level = "species") %>%
  bind_rows(genus_data %>% mutate(species = "", level = "genus")) %>%
  bind_rows(family_data %>% mutate(genus = "", species = "", level = "family")) %>%
  ungroup() %>%
  arrange(family, genus, species)

# Create a data tree structure
tree_data <- hierarchical_data %>%
  mutate(pathString = paste("Total", family, genus, species, sep = "/")) %>%
  dplyr::select(pathString, specimen_count)

tree <- as.Node(tree_data)
tree$specimen_count <- total_specimen_count

# Print the tree structure to console
print(tree, "specimen_count")
                            levelName specimen_count
1  Total                                        7338
2   ¦--Betulaceae                               1027
3   ¦   °--Betula                               1027
4   ¦       ¦--Betula alleghaniensis             110
5   ¦       ¦--Betula glandulosa                  64
6   ¦       ¦--Betula lenta                      130
7   ¦       ¦--Betula nigra                       95
8   ¦       ¦--Betula occidentalis               162
9   ¦       ¦--Betula papyrifera                 136
10  ¦       ¦--Betula populifolia                 84
11  ¦       ¦--Betula pumila                     207
12  ¦       °--Betula sandbergii                  39
13  ¦--Fagaceae                                 1845
14  ¦   °--Quercus                              1845
15  ¦       ¦--Quercus agrifolia                 207
16  ¦       ¦--Quercus alba                       75
17  ¦       ¦--Quercus berberidifolia             69
18  ¦       ¦--Quercus chrysolepis               201
19  ¦       ¦--Quercus durata                     48
20  ¦       ¦--Quercus ellipsoidalis              34
21  ¦       ¦--Quercus falcata                    41
22  ¦       ¦--Quercus gambelii                  120
23  ¦       ¦--Quercus garryana                   50
24  ¦       ¦--Quercus ilicifolia                 71
25  ¦       ¦--Quercus kelloggii                  64
26  ¦       ¦--Quercus lobata                     37
27  ¦       ¦--Quercus macrocarpa                 81
28  ¦       ¦--Quercus marilandica                67
29  ¦       ¦--Quercus nigra                      45
30  ¦       ¦--Quercus parvula                    37
31  ¦       ¦--Quercus prinoides                  33
32  ¦       ¦--Quercus rubra                      55
33  ¦       ¦--Quercus sadleriana                 41
34  ¦       ¦--Quercus stellata                   69
35  ¦       ¦--Quercus turbinella                 58
36  ¦       ¦--Quercus vacciniifolia              63
37  ¦       ¦--Quercus velutina                   52
38  ¦       ¦--Quercus virginiana                 46
39  ¦       °--Quercus wislizeni                 181
40  ¦--Moraceae                                  248
41  ¦   °--Morus                                 248
42  ¦       ¦--Morus alba                        111
43  ¦       ¦--Morus microphylla                  45
44  ¦       °--Morus rubra                        92
45  ¦--Oleaceae                                  775
46  ¦   °--Fraxinus                              775
47  ¦       ¦--Fraxinus americana                131
48  ¦       ¦--Fraxinus anomala                  123
49  ¦       ¦--Fraxinus cuspidata                 55
50  ¦       ¦--Fraxinus dipetala                 113
51  ¦       ¦--Fraxinus pennsylvanica            240
52  ¦       °--Fraxinus velutina                 113
53  ¦--Salicaceae                                807
54  ¦   °--Populus                               807
55  ¦       ¦--Populus angustifolia               75
56  ¦       ¦--Populus balsamifera                35
57  ¦       ¦--Populus deltoides                 237
58  ¦       ¦--Populus fremontii                 126
59  ¦       ¦--Populus grandidentata             101
60  ¦       °--Populus tremuloides               233
61  ¦--Sapindaceae                              2086
62  ¦   °--Acer                                 2086
63  ¦       ¦--Acer circinatum                    45
64  ¦       ¦--Acer glabrum                      172
65  ¦       ¦--Acer macrophyllum                 225
66  ¦       ¦--Acer negundo                      464
67  ¦       ¦--Acer pensylvanicum                181
68  ¦       ¦--Acer platanoides                  106
69  ¦       ¦--Acer rubrum                       465
70  ¦       ¦--Acer saccharinum                   89
71  ¦       ¦--Acer saccharum                    130
72  ¦       ¦--Acer spicatum                     167
73  ¦       °--Acer tataricum                     42
74  °--Ulmaceae                                  550
75      °--Ulmus                                 550
76          ¦--Ulmus alata                        99
77          ¦--Ulmus americana                   265
78          ¦--Ulmus pumila                       49
79          °--Ulmus rubra                       137

5 compare spatial and temporal sensitivity

summary_results <- read.csv("../species_summary_herb.csv")

# plot a figure to show the comparasion of anom slope and norm lope with confidence interval and model fitting
ggplot(summary_results, aes(x = anom_estimate, y = norm_estimate)) +
  geom_errorbar(aes(ymin = norm_conf_low, ymax = norm_conf_high, alpha = r_squared), width = 0) +
  geom_errorbarh(aes(xmin = anom_conf_low, xmax = anom_conf_high, alpha = r_squared), height = 0) +
  geom_point(aes(alpha = r_squared), size = 3) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") +  # Add 1:1 line
  scale_alpha_continuous(range = c(0.2, 1)) +  # Adjust alpha range for better visualization
  labs(
    title = "Slope Estimates for Anom and Norm by Species",
    x = "Anom Slope Estimate",
    y = "Norm Slope Estimate",
    alpha = "R^2"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

5.1 kick the r^2 less than 0.2

ggplot(summary_results %>% filter(r_squared>0.3), aes(x = anom_estimate, y = norm_estimate)) +
  geom_errorbar(aes(ymin = norm_conf_low, ymax = norm_conf_high, alpha = r_squared), width = 0) +
  geom_errorbarh(aes(xmin = anom_conf_low, xmax = anom_conf_high, alpha = r_squared), height = 0) +
  geom_point(aes(alpha = r_squared), size = 3) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") +  # Add 1:1 line
  scale_alpha_continuous(range = c(0.2, 1)) +  # Adjust alpha range for better visualization
  labs(
    title = "Slope Estimates for Anom and Norm by Species",
    x = "Anom Slope Estimate",
    y = "Norm Slope Estimate",
    alpha = "R^2"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

5.2 check the not overlap species

to_check <- summary_results %>% 
#filter for the species that the anom slope and norm slope confidence interval do not overlap
  filter(anom_conf_low > norm_conf_high | anom_conf_high < norm_conf_low) %>%
  arrange(desc(r_squared))

print(to_check)
            species anom_estimate anom_conf_low anom_conf_high norm_estimate
1     Quercus nigra    2.43779652    -1.0419418       5.917535     -4.903699
2      Betula nigra   -0.01562399    -3.5261793       3.494931     -4.629899
3       Ulmus alata    1.72035976    -2.3772538       5.817973     -5.869936
4 Betula glandulosa    5.54789354     0.2870689      10.808718     -6.449175
  norm_conf_low norm_conf_high r_squared
1     -6.232752      -3.574645 0.5839906
2     -5.657750      -3.602047 0.4653512
3     -8.106634      -3.633238 0.2572838
4     -9.420004      -3.478347 0.2510565

5.3 whether they are false positive: apply Bonferroni correction

to_check_species <- temperature_data %>%
  filter(species %in% to_check$species)

result <- to_check_species %>%
  group_by(species) %>%
  do(broom::tidy(lm(doy ~ anom + norm, data = .), conf.int = TRUE)) %>%
  ungroup()

# Count the number of unique species
num_tests <- temperature_data %>% 
  pull(species) %>% 
  unique() %>% 
  length()

# Bonferroni-adjusted alpha
alpha <- 0.05
adjusted_alpha <- alpha / num_tests

# Calculate the Bonferroni-adjusted critical value for the t-distribution
# Assuming degrees of freedom is the same for each model; you can adjust this part if needed
df <- to_check_species %>% 
  group_by(species) %>% 
  summarize(df = lm(doy ~ anom + norm, data = .)$df.residual) %>%
  pull(df) %>%
  unique() %>%
  mean()

# Get the critical t-value
critical_value <- qt(1 - adjusted_alpha / 2, df)

# Adjust the confidence intervals
result <- result %>%
  mutate(
    conf.low.adjusted = estimate - critical_value * std.error,
    conf.high.adjusted = estimate + critical_value * std.error
  ) %>% 
  dplyr::select(species, term, estimate, conf.low.adjusted, conf.high.adjusted) %>%
  filter(term %in% c("anom", "norm")) %>% 
  tidyr::pivot_wider(names_from = term, values_from = c(estimate, conf.low.adjusted, conf.high.adjusted))
  
result %>%
    filter(conf.low.adjusted_anom > conf.high.adjusted_norm | conf.high.adjusted_anom < conf.low.adjusted_norm) 
# A tibble: 0 × 7
# ℹ 7 variables: species <chr>, estimate_anom <dbl>, estimate_norm <dbl>,
#   conf.low.adjusted_anom <dbl>, conf.low.adjusted_norm <dbl>,
#   conf.high.adjusted_anom <dbl>, conf.high.adjusted_norm <dbl>